library(corrplot)
## corrplot 0.84 loaded
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Jordan

# Load the data for this section
jh_data <- read.csv(file = 'NBA_salary_stats.csv', row.names = 1, header = TRUE)

# Correlation plot (Khoi's code).
# Columns dropped before cor(): identifier / non-numeric / alternate-salary
# columns that cannot enter a correlation matrix.
jh_drop_cols <- c(1, 3, 4, 30, 38, 45)
corrplot(cor(jh_data[, -jh_drop_cols], use = 'complete.obs'),
         method = 'circle',
         type = 'upper')

# Just the correlation of every retained predictor with y (column 1 = Actual).
# Pass use = 'complete.obs' to match the corrplot call above on the same
# columns; otherwise any missing value turns the whole column's correlation
# into NA and the filter below silently drops it.
correlations <- cor(jh_data[, -c(1, 3, 4, 30, 38, 45)],
                    use = 'complete.obs')[, 1]

# save the highly correlated variables (above .5 is my threshold)
high_cor <- Filter(function(x) x > .5, correlations)

# remove the y variable by name; positional high_cor[2:length(high_cor)] is
# fragile (2:1 counts down when only 'Actual' clears the threshold)
high_cor <- high_cor[names(high_cor) != 'Actual']

# For each highly correlated predictor: fit a simple regression of Actual on
# it, plot the fit, then plot its residuals.
for (c in names(high_cor)) {
  # simple linear model for this predictor
  fit <- lm(jh_data$Actual ~ jh_data[[c]])

  # scatterplot with the fitted regression line overlaid
  plot(jh_data$Actual ~ jh_data[[c]], main = c)
  abline(fit, col = 'red')

  # residual plot. These look really good overall; the only note is that the
  # larger errors at the high end sit above the clear mass of residuals in
  # most plots.
  plot(residuals(fit), main = c)
  abline(h = 0)
}

# see chap 8 and chap 3
# Report which columns of jh_data are numeric. The original loop evaluated
# is.numeric(jh_data[c]) and discarded the result; jh_data[c] is also a
# one-column data.frame, for which is.numeric() is always FALSE. Extract each
# column properly and print a named logical vector instead.
print(vapply(jh_data, is.numeric, logical(1)))
### Khoi

# exploratory data analysis
## reading in data
kt_data <- read.csv(file = 'NBA_salary_stats.csv',
                    row.names = 1,
                    header = TRUE)

# drop Guaranteed salary in exchange for using 'X2019.20' (actual pay) when
# predicting salary (index = 3rd column)
### correlation plots, 2019-20 season performance stats

## box-score columns shared by the first two plots
kt_common <- c('ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF')

## correlation plot for total point scoring (counting stats)
kt_counts <- c('Actual', 'MP', 'G', 'GS', 'X2P', 'X3P', 'FT', kt_common)
corrplot(cor(subset(kt_data, select = kt_counts), use = 'complete.obs'),
         method = 'circle',
         type = 'upper')

## correlation plot with accuracy (e.g. 3-point percentage)
kt_rates <- c('Actual', 'MP', 'G', 'GS.', 'X2P.', 'X3P.', 'FT.', kt_common)
corrplot(cor(subset(kt_data, select = kt_rates), use = 'complete.obs'),
         method = 'circle',
         type = 'upper')

### correlation plot, overall career statistics
# e.g. mid-season trades, experience, age, etc.
kt_career <- c('Actual', 'yrs_exp', 'age_yrs', 'draft_num', 'num_accolades',
               'NBA_titles', 'all_NBA', 'num_teams_1920', 'mid_season_trades',
               'num_yrs_current_team', 'num_teams_played', 'injury_yrs',
               'twitter_followers')
corrplot(cor(subset(kt_data, select = kt_career), use = 'complete.obs'),
         method = 'circle',
         type = 'upper')

## list variables most correlated to actual salary (2019-20 counting stats)
## NOTE(review): unlike the corrplot calls, cor() here uses the default
## use = 'everything'; the echoed output below shows no NAs, so these columns
## appear complete — confirm if the data changes.
cor(subset(kt_data, select = c('Actual', 'MP', 'G', 'GS', 'X2P', 'X3P', 'FT', 
                               'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF')))[,1]
##    Actual        MP         G        GS       X2P       X3P        FT       ORB 
## 1.0000000 0.4776307 0.2580372 0.5198420 0.5622481 0.3826085 0.5813015 0.3301995 
##       DRB       AST       STL       BLK       TOV        PF 
## 0.5018673 0.5243857 0.4094649 0.3252348 0.5551408 0.3668231
# GS, X2P, FT, DRB, AST, TOV are > 0.5
# MP, X3P, ORB, STL, BLK, PF > 0.3
## same list, with the shooting-percentage (accuracy) columns substituted in
cor(subset(kt_data, select = c('Actual', 'MP', 'G', 'GS.', 'X2P.', 'X3P.', 'FT.', 
                               'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF')))[,1]
##     Actual         MP          G        GS.       X2P.       X3P.        FT. 
## 1.00000000 0.47763072 0.25803717 0.55611939 0.05032016 0.11237290 0.11830423 
##        ORB        DRB        AST        STL        BLK        TOV         PF 
## 0.33019950 0.50186730 0.52438573 0.40946493 0.32523475 0.55514079 0.36682313
# GS., DRB, AST, TOV are > 0.5
# MP, ORB, STL, BLK, PF > 0.3
## same list for the career/background variables
cor(subset(kt_data, select = c('Actual', 'yrs_exp', 'age_yrs', 'draft_num', 
                               'num_accolades', 'NBA_titles', 'all_NBA', 'num_teams_1920', 
                               'mid_season_trades', 'num_yrs_current_team', 
                               'num_teams_played', 'injury_yrs', 'twitter_followers')))[,1]
##               Actual              yrs_exp              age_yrs 
##           1.00000000           0.50866761           0.37333296 
##            draft_num        num_accolades           NBA_titles 
##          -0.09161885           0.58950054           0.12658476 
##              all_NBA       num_teams_1920    mid_season_trades 
##           0.47315504           0.01393671           0.01393671 
## num_yrs_current_team     num_teams_played           injury_yrs 
##           0.34030605           0.06716042           0.16221740 
##    twitter_followers 
##           0.30122396
# yrs_exp, num_accolades are 0.5
# age_yrs, all_NBA, num_yrs_current_team, twitter_followers > 0.3
# Jordan
# automated search procedures

# !! Do we want to split the data and do crosswise regression?
## intercept-only model. Use the formula interface (Actual ~ 1) rather than
## jh_data$Actual ~ 1, so the response is resolved through data= — required
## for step() to move between the null and full models in the same scope,
## and consistent with how regfull is specified.
regnull <- lm(Actual ~ 1, data = jh_data)

## model with all predictors except the identifier columns and the alternate
## salary measure
regfull <- lm(Actual ~ . - Player - Guaranteed - url, data = jh_data)

# step_reg <- step(regnull, scope=list(lower=regnull, upper=regfull), direction="both")
# model evaluation see chapters 5 and 7
library(faraway)
## 
## Attaching package: 'faraway'
## The following object is masked from 'package:GGally':
## 
##     happy
# Candidate multiple-regression model for Actual salary.
# NOTE(review): this predictor set appears to be the result of the
# commented-out step() search above — confirm before relying on it.
jh_result<-lm(Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + 
    GS. + NBA_titles + num_teams_1920 + PF + DRB + GS + eFG. + 
    all_NBA, data = jh_data)

# coefficient estimates and overall fit (output echoed below)
summary(jh_result)
## 
## Call:
## lm(formula = Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + 
##     GS. + NBA_titles + num_teams_1920 + PF + DRB + GS + eFG. + 
##     all_NBA, data = jh_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -24942709  -2311847    -36118   2071513  18876940 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1757364.4   713656.5  -2.462 0.014173 *  
## num_accolades      704532.2   294578.3   2.392 0.017183 *  
## PTS                  5865.4      967.9   6.060 2.89e-09 ***
## yrs_exp           1249579.0   100419.8  12.444  < 2e-16 ***
## num_teams_played -1544257.3   201325.6  -7.670 1.07e-13 ***
## GS.               8659338.7  1696598.0   5.104 4.92e-07 ***
## NBA_titles       -1719365.4   542477.4  -3.169 0.001632 ** 
## num_teams_1920    3667862.2  1055950.9   3.474 0.000564 ***
## PF                 -25757.8     7395.8  -3.483 0.000545 ***
## DRB                  9673.4     3450.7   2.803 0.005277 ** 
## GS                 -74325.6    31020.7  -2.396 0.016983 *  
## eFG.             -3911371.9  2049989.6  -1.908 0.057029 .  
## all_NBA            447025.3   288252.4   1.551 0.121651    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5124000 on 449 degrees of freedom
## Multiple R-squared:  0.6633, Adjusted R-squared:  0.6543 
## F-statistic: 73.71 on 12 and 449 DF,  p-value: < 2.2e-16
# sequential (Type I) ANOVA table for the fitted model
anova(jh_result)
## Analysis of Variance Table
## 
## Response: Actual
##                   Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## num_accolades      1 1.2165e+16 1.2165e+16 463.4195 < 2.2e-16 ***
## PTS                1 4.7451e+15 4.7451e+15 180.7565 < 2.2e-16 ***
## yrs_exp            1 2.1762e+15 2.1762e+15  82.8976 < 2.2e-16 ***
## num_teams_played   1 2.0693e+15 2.0693e+15  78.8283 < 2.2e-16 ***
## GS.                1 6.1726e+14 6.1726e+14  23.5139 1.712e-06 ***
## NBA_titles         1 4.0733e+14 4.0733e+14  15.5165 9.480e-05 ***
## num_teams_1920     1 2.5231e+14 2.5231e+14   9.6115 0.0020552 ** 
## PF                 1 3.9256e+14 3.9256e+14  14.9540 0.0001264 ***
## DRB                1 1.1701e+14 1.1701e+14   4.4573 0.0353047 *  
## GS                 1 1.1810e+14 1.1810e+14   4.4987 0.0344673 *  
## eFG.               1 9.6619e+13 9.6619e+13   3.6806 0.0556839 .  
## all_NBA            1 6.3134e+13 6.3134e+13   2.4050 0.1216513    
## Residuals        449 1.1787e+16 2.6251e+13                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# variance inflation factors (vif from the faraway package loaded above) to
# screen the predictors for multicollinearity
vif(jh_result)
##    num_accolades              PTS          yrs_exp num_teams_played 
##         3.774549         3.650018         3.014659         2.590077 
##              GS.       NBA_titles   num_teams_1920               PF 
##         8.291112         1.376388         8.998956         4.614315 
##              DRB               GS             eFG.          all_NBA 
##         4.463431        12.166038         9.453790         2.388246
# multicollinearity appears to be an issue with GS (VIF ~12) and potentially
# eFG., num_teams_1920, and GS. (VIFs ~8-9.5)
# correlation between variables? inspect the coefficient covariance matrix
# restricted to the flagged predictors:

vcov(jh_result)[,c('GS','eFG.',"num_teams_1920","GS.")]
##                            GS          eFG. num_teams_1920           GS.
## (Intercept)        9763583887  1.860193e+11  -2.540477e+11 -632128174737
## num_accolades      1608937133  1.932269e+10  -8.122633e+09  -98031547225
## PTS                  -7837783 -1.226421e+07  -6.260376e+05     121083911
## yrs_exp             160994678 -1.740717e+09   6.798448e+09  -16301658003
## num_teams_played   -483040741 -2.024686e+10  -1.268261e+10   48870659815
## GS.              -45352232117 -3.379549e+11   3.368037e+11 2878444861631
## NBA_titles        -1856407142 -3.233093e+10   1.360718e+10   85831133768
## num_teams_1920    -5417073792 -1.984803e+12   1.115032e+12  336803745321
## PF                  -51900343 -2.427541e+09   5.363480e+08    1662852170
## DRB                 -24464050 -8.514083e+08   3.845955e+08     855183532
## GS                  962283269  8.131469e+09  -5.417074e+09  -45352232117
## eFG.               8131469194  4.202458e+12  -1.984803e+12 -337954901794
## all_NBA            -275278304  3.978419e+09  -1.311960e+09   30661898053

# From the VCOV table, looking at rows specifically: GS shows large covariance
# entries against GS., eFG., and num_teams_1920, consistent with the
# multicollinearity flagged by VIF. (This line was bare prose — now a comment.)

# let's try dropping GS (highest VIF) and eFG. (also a major one) and see how things change
jh_result.2<-lm(Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + 
    GS. + NBA_titles + num_teams_1920 + PF + DRB + 
    all_NBA, data = jh_data)
# fit summary for the reduced model (output echoed below)
summary(jh_result.2)
## 
## Call:
## lm(formula = Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + 
##     GS. + NBA_titles + num_teams_1920 + PF + DRB + all_NBA, data = jh_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -24725614  -2278176    -64771   1993189  18498666 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -921017.9   642952.5  -1.432 0.152699    
## num_accolades      833353.3   291996.3   2.854 0.004516 ** 
## PTS                  5302.9      940.5   5.638 3.04e-08 ***
## yrs_exp           1259550.9   100977.3  12.474  < 2e-16 ***
## num_teams_played -1594406.2   201971.7  -7.894 2.24e-14 ***
## GS.               5192350.9   866562.5   5.992 4.25e-09 ***
## NBA_titles       -1875971.8   542885.4  -3.456 0.000601 ***
## num_teams_1920    1709343.9   421254.8   4.058 5.84e-05 ***
## PF                 -31346.3     7187.4  -4.361 1.60e-05 ***
## DRB                  7271.8     3367.9   2.159 0.031363 *  
## all_NBA            430770.6   290119.8   1.485 0.138295    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5160000 on 451 degrees of freedom
## Multiple R-squared:  0.657,  Adjusted R-squared:  0.6494 
## F-statistic:  86.4 on 10 and 451 DF,  p-value: < 2.2e-16
# Adj R2 is about the same (0.6494 vs 0.6543), so dropping GS and eFG. costs
# little explanatory power
anova(jh_result.2)
## Analysis of Variance Table
## 
## Response: Actual
##                   Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## num_accolades      1 1.2165e+16 1.2165e+16 456.9866 < 2.2e-16 ***
## PTS                1 4.7451e+15 4.7451e+15 178.2474 < 2.2e-16 ***
## yrs_exp            1 2.1762e+15 2.1762e+15  81.7469 < 2.2e-16 ***
## num_teams_played   1 2.0693e+15 2.0693e+15  77.7341 < 2.2e-16 ***
## GS.                1 6.1726e+14 6.1726e+14  23.1874 2.009e-06 ***
## NBA_titles         1 4.0733e+14 4.0733e+14  15.3011 0.0001058 ***
## num_teams_1920     1 2.5231e+14 2.5231e+14   9.4781 0.0022062 ** 
## PF                 1 3.9256e+14 3.9256e+14  14.7465 0.0001406 ***
## DRB                1 1.1701e+14 1.1701e+14   4.3954 0.0365922 *  
## all_NBA            1 5.8689e+13 5.8689e+13   2.2046 0.1382947    
## Residuals        451 1.2006e+16 2.6621e+13                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# recheck VIFs after dropping GS and eFG.
vif(jh_result.2)
##    num_accolades              PTS          yrs_exp num_teams_played 
##         3.657189         3.398566         3.005911         2.570543 
##              GS.       NBA_titles   num_teams_1920               PF 
##         2.132965         1.359324         1.412289         4.297441 
##              DRB          all_NBA 
##         4.192612         2.385706
# much better VIF (all at or below ~4.3 now)
# partial F test: if p < 0.05 we reject the null and keep the full model;
# if p > 0.05 we fail to reject and can drop the tested predictor.
# (The original comment had this backwards.)
# remove all_NBA, the least significant term in jh_result.2
reduced.1 <- lm(Actual~num_accolades + PTS + yrs_exp + num_teams_played + 
    GS. + NBA_titles + num_teams_1920 + PF + DRB, data = jh_data)
anova(reduced.1,jh_result.2) 
## Analysis of Variance Table
## 
## Model 1: Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + GS. + 
##     NBA_titles + num_teams_1920 + PF + DRB
## Model 2: Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + GS. + 
##     NBA_titles + num_teams_1920 + PF + DRB + all_NBA
##   Res.Df        RSS Df  Sum of Sq      F Pr(>F)
## 1    452 1.2065e+16                            
## 2    451 1.2006e+16  1 5.8689e+13 2.2046 0.1383
# fit summary for the chosen reduced model (output echoed below)
summary(reduced.1)
## 
## Call:
## lm(formula = Actual ~ num_accolades + PTS + yrs_exp + num_teams_played + 
##     GS. + NBA_titles + num_teams_1920 + PF + DRB, data = jh_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -23147747  -2217941      1273   2103629  18287198 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -972505.9   642871.7  -1.513 0.131041    
## num_accolades     1114547.8   222548.9   5.008 7.90e-07 ***
## PTS                  5339.4      941.5   5.671 2.53e-08 ***
## yrs_exp           1266141.9   101014.0  12.534  < 2e-16 ***
## num_teams_played -1613655.0   201823.6  -7.995 1.09e-14 ***
## GS.               5100909.3   865522.4   5.893 7.40e-09 ***
## NBA_titles       -1963533.7   540391.6  -3.634 0.000312 ***
## num_teams_1920    1708847.2   421815.6   4.051 6.00e-05 ***
## PF                 -31102.9     7195.1  -4.323 1.90e-05 ***
## DRB                  7054.1     3369.1   2.094 0.036840 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5166000 on 452 degrees of freedom
## Multiple R-squared:  0.6554, Adjusted R-squared:  0.6485 
## F-statistic:  95.5 on 9 and 452 DF,  p-value: < 2.2e-16
# we fail to reject the null so let's go with the smaller model
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
# Do we need to transform the response? If lambda = 1 is in the Box-Cox
# confidence interval we do not need a transformation; if 0 is in the CI use
# ln(y); otherwise use y^lambda (see pg 17 in notes).
boxcox(jh_result)

# check distribution of residuals.
# BUG FIX: the original passed main = c, the leftover loop variable from the
# exploratory loop above, so the title was whatever predictor name came last.
# Give the plot an explicit, descriptive title instead.
plot(residuals(jh_result), main = "Residuals of jh_result")
abline(h=0,col='red')

# check normality of residuals
qqnorm(jh_result$residuals)
qqline(jh_result$residuals, col="red")

# autocorrelation
acf(jh_result$residuals, main="ACF of Residuals")

# Questions:
# 1. Is it a problem if we don't have categorical variables?
# 2. VIF suggests we have some multicollinearity; do we drop the predictors with high VIFs?
# 3. Do we want to err on the side of more predictors (less bias) or fewer predictors (less variance)?
# 4. Should we do cross-validation?
# 5. Should we check the AIC/BIC/PRESS statistic (pg. 43)?
# 6. Test predictive ability?
# 7. Should we try to get better linear fits by dealing with outliers?
# 8. Partial regression plot (pg. 48, module 8)?

# Convert position to a factor so it can be used as a categorical predictor.
jh_data$Pos<-factor(jh_data$Pos)
# sanity checks: confirm the conversion and inspect the 14 position levels
is.factor(jh_data$Pos)
## [1] TRUE
levels(jh_data$Pos)
##  [1] "C"     "C-PF"  "PF"    "PF-C"  "PF-SF" "PG"    "PG-SG" "SF"    "SF-C" 
## [10] "SF-PF" "SF-SG" "SG"    "SG-PG" "SG-SF"
# treatment (dummy) coding; "C" is the reference level (its row is all zeros)
contrasts(jh_data$Pos)
##       C-PF PF PF-C PF-SF PG PG-SG SF SF-C SF-PF SF-SG SG SG-PG SG-SF
## C        0  0    0     0  0     0  0    0     0     0  0     0     0
## C-PF     1  0    0     0  0     0  0    0     0     0  0     0     0
## PF       0  1    0     0  0     0  0    0     0     0  0     0     0
## PF-C     0  0    1     0  0     0  0    0     0     0  0     0     0
## PF-SF    0  0    0     1  0     0  0    0     0     0  0     0     0
## PG       0  0    0     0  1     0  0    0     0     0  0     0     0
## PG-SG    0  0    0     0  0     1  0    0     0     0  0     0     0
## SF       0  0    0     0  0     0  1    0     0     0  0     0     0
## SF-C     0  0    0     0  0     0  0    1     0     0  0     0     0
## SF-PF    0  0    0     0  0     0  0    0     1     0  0     0     0
## SF-SG    0  0    0     0  0     0  0    0     0     1  0     0     0
## SG       0  0    0     0  0     0  0    0     0     0  1     0     0
## SG-PG    0  0    0     0  0     0  0    0     0     0  0     1     0
## SG-SF    0  0    0     0  0     0  0    0     0     0  0     0     1
# One data subset per position level: a1..a14 follow the sorted order of
# levels(jh_data$Pos), so a1 = "C", a2 = "C-PF", ..., a14 = "SG-SF".
pos_levels <- levels(jh_data$Pos)
for (i in seq_along(pos_levels)) {
  assign(paste0("a", i), subset(jh_data, Pos == pos_levels[i]))
}


# Per-position simple regressions of Guaranteed on PTS; reg1..reg14 parallel
# the subsets a1..a14.
for (i in seq_len(14)) {
  assign(paste0("reg", i),
         lm(Guaranteed ~ PTS, data = get(paste0("a", i))))
}

# Scatter of Guaranteed vs PTS for all players, then overplot each position
# group a2..a14 with its own symbol (pch = group index) and colour.
plot(jh_data$PTS,jh_data$Guaranteed, main="Guaranteed against PTS, by Pos")
group_cols <- c("red", "blue", "green", "orange", "purple", "pink",
                "seagreen2", "salmon3", "royalblue4", "rosybrown4",
                "skyblue2", "slateblue2", "slategray2")
for (i in 2:14) {
  grp <- get(paste0("a", i))
  points(grp$PTS, grp$Guaranteed, pch = i, col = group_cols[i - 1])
}

# Regression lines for selected position groups, matching each group's
# lty/colour from the scatter above. reg1 ("C") uses the default colour.
abline(reg1, lty = 1)

# Groups 2, 7, 9, 11, and 14 are intentionally omitted, as in the original
# commented-out calls. NOTE(review): presumably too few points to fit a
# meaningful line — confirm.
shown <- c(3, 4, 5, 6, 8, 10, 12, 13)
line_cols <- c(`3` = "blue", `4` = "green", `5` = "orange", `6` = "purple",
               `8` = "seagreen2", `10` = "royalblue4", `12` = "skyblue2",
               `13` = "slateblue2")
for (i in shown) {
  abline(get(paste0("reg", i)), lty = i, col = line_cols[as.character(i)])
}